


***** info on matched Facebook-Twitter items
* Stack the pieces max_cosine_1 ... max_cosine_31 from the subfolder
* "max cosine files" into a single dataset.
cd "max cosine files"
use "max_cosine_1.dta", clear

forvalues part = 2/31 {
    append using "max_cosine_`part'.dta"
}

* Return to the parent folder and park the stacked data under a temporary
* name; it is merged into the item-level data and erased further below.
cd ..
save "max_cosine_temp.dta", replace


***** topics
* Read the item-by-topic table (column 1 = item id, columns 2-19 = topic1-topic18)
* and reduce it to one string per item: the name of the topic with the largest value.
import delimited "topics_by_item18.csv", clear

rename v1 id

* v2 ... v19 become topic1 ... topic18
forval i = 2/19 {
    rename v`i' topic`=`i'-1'
}

local vars " topic1 topic2 topic3 topic4 topic5 topic6 topic7 topic8 topic9 topic10 topic11 topic12 topic13 topic14 topic15 topic16 topic17 topic18 "

* Keep the row maximum in double precision. Without an explicit type egen uses
* the default storage type (float); a float copy of a double-precision topic
* value does not compare equal to its source column, which would leave
* dominant_topic empty for that item.
egen double m2 = rowmax(`vars')
gen dominant_topic = ""

* Later iterations overwrite earlier ones, so on ties the highest-numbered topic wins.
* The !missing() guard keeps items whose topic values are all missing unlabelled
* (in Stata ". == ." is true, so they would otherwise be tagged as topic18).
foreach var of local vars {
    replace dominant_topic = "`var'" if m2 == `var' & !missing(m2)
}
drop m2

* Human-readable topic names.
* NOTE(review): the spelling "comitted" is kept as is, because code outside this
* file may match on the exact string - confirm before correcting it.
gen dominant_name = ""
replace dominant_name = "accidents" if dominant_topic == "topic1"
replace dominant_name = "refugee crisis" if dominant_topic == "topic2"
replace dominant_name = "crime investigations" if dominant_topic == "topic3"
replace dominant_name = "traffic news" if dominant_topic == "topic4"
replace dominant_name = "weather" if dominant_topic == "topic5"
replace dominant_name = "tech news" if dominant_topic == "topic6"
replace dominant_name = "court news" if dominant_topic == "topic7"
replace dominant_name = "panorama" if dominant_topic == "topic8"
replace dominant_name = "crime comitted by refugees" if dominant_topic == "topic9"
replace dominant_name = "foreign news" if dominant_topic == "topic10"
replace dominant_name = "business" if dominant_topic == "topic11"
replace dominant_name = "economy" if dominant_topic == "topic12"
replace dominant_name = "politics" if dominant_topic == "topic13"
replace dominant_name = "call to actions" if dominant_topic == "topic14"
replace dominant_name = "sports" if dominant_topic == "topic15"
replace dominant_name = "corporate crime" if dominant_topic == "topic16"
replace dominant_name = "outlets' own matters" if dominant_topic == "topic17"
replace dominant_name = "highlights of print edition" if dominant_topic == "topic18"

* Only id and dominant_name are carried forward.
drop dominant_topic topic*


***** merge with tweets / posts
* Items found only in the micro data (no row in the topic table) get the label "empty".
merge 1:1 id using "fb tw micro data.dta"
replace dominant_name = "empty" if _merge == 2
drop _merge


***** text stats
merge 1:1 id using "text_stats.dta"
drop _merge doc_id

* Shift every start_w_* indicator down by one, then set start_all = 1 when at
* least one shifted indicator equals 1.
* NOTE(review): presumably the indicators arrive coded 1/2 and the shift turns
* them into 0/1 - confirm against text_stats.dta.
* NOTE(review): start_all is 0, not missing, for items without text statistics,
* so the missing(start_all) check in the sample filter further below never
* triggers on its own.
gen start_all = 0
foreach q in was wer wo wann warum wie {
    replace start_w_`q' = start_w_`q' - 1
    replace start_all = 1 if start_w_`q' == 1
}
drop start_w_was start_w_wer start_w_wo start_w_wann start_w_warum start_w_wie


***** matched tweets/posts
* Items without a match record get a maximum cosine similarity of 0.
merge 1:1 id using "max_cosine_temp.dta"
drop _merge
replace max_cosine = 0 if missing(max_cosine)

* The temporary stack of match files is no longer needed.
erase "max_cosine_temp.dta"

save "fb tw micro data w vars.dta", replace

* Collapse the item-level data to one row per outlet-platform and day:
*   items         = number of items (posts or tweets) on that day
*   avg_likes     = mean likes per item on that day
*   avg_shares    = mean shares per item on that day
* and derive from it the 1- to 7-day lags of the two daily means plus a
* polynomial outlet trend. The result is stored in temp.dta.
gen n = 1
collapse (sum) items=n (mean) avg_likes=likes avg_shares=shares, by(date outlet_platform_id)

tsset outlet_platform_id date

* Lags of the daily means; a lag is missing when the corresponding earlier day
* is not in the data for that outlet-platform.
foreach outcome in likes shares {
    forvalues k = 1/7 {
        gen l`k'_avg_`outcome' = l`k'.avg_`outcome'
    }
}

* Only the lags are kept, not the same-day means.
drop avg_likes avg_shares

* Running index of observed days within each outlet-platform (days without any
* item are not counted). Sorting on date inside the by-group makes the index
* independent of whatever order earlier commands left the data in.
bysort outlet_platform_id (date): gen outlet_trend = _n
gen outlet_trend_sq = outlet_trend*outlet_trend
gen outlet_trend_tri = outlet_trend*outlet_trend*outlet_trend

save "temp.dta", replace

* Back to the item-level data: attach the daily outlet-platform aggregates
* and remove the temporary file afterwards.
use "fb tw micro data w vars.dta", clear

merge m:1 date outlet_platform_id using "temp.dta"
drop _merge
erase "temp.dta"

* days until/since the algorithm change on 17 March (td(17mar2016) equals 20530)
gen rel_date = date - td(17mar2016)

* national (1) vs. regional (0) outlets
gen national = 0
foreach o in "faz" "focus" "fr" "handelsblatt" "spiegel" "sz" "taz" "welt" "welt kompakt" "zeit" {
    replace national = 1 if outlet == "`o'"
}

* tabloid (1) vs. quality media (0)
gen tabloid = 0
foreach o in "bz berlin" "express" "mopo hh" {
    replace tabloid = 1 if outlet == "`o'"
}

* Transformed outcomes: log(1 + x) and inverse hyperbolic sine, used to dampen
* outliers and to avoid having to estimate negative binomial models.
gen log_likes = ln(1 + likes)
gen log_shares = ln(1 + shares)
gen asinh_likes = asinh(likes)
gen asinh_shares = asinh(shares)

* Keep only items for which every variable on the list below is non-missing.
local required items ///
    l1_avg_likes l2_avg_likes l3_avg_likes l4_avg_likes l5_avg_likes l6_avg_likes l7_avg_likes ///
    l1_avg_shares l2_avg_shares l3_avg_shares l4_avg_shares l5_avg_shares l6_avg_shares l7_avg_shares ///
    hour start_all number_of_words mean_word_length questionmarks exclamationmarks ///
    share_neg_words share_pos_words
foreach v of local required {
    drop if missing(`v')
}

save "fb tw micro data w vars.dta", replace


**** include number of Twitter followers
* Read the follower sheet (first row holds the variable names), discard the
* columns that are not needed and the rows without an outlet name, then attach
* the item-level data by outlet.
import excel "followers twitter.xlsx", firstrow clear
drop username Followers
drop if missing(outlet)

merge 1:m outlet using "fb tw micro data w vars.dta"
drop _merge

* Restrict to the window of 30 days before to 30 days after the reference date;
* rows with a missing rel_date fall outside the window and are removed as well.
keep if inrange(rel_date, -30, 30)

* Quality index Wellbrock (2011)
* One hard-coded score per outlet name; outlets that are not listed here keep
* a missing value in quality.
* NOTE(review): the outlet names used here differ in part from those in the
* national-outlet list earlier in this file ("spon" and "zeitonline" appear only
* here; "spiegel", "focus" and "welt kompakt" appear only there) - confirm which
* spellings the outlet variable actually contains.
gen quality = .
replace quality = 7.26 if outlet == "berliner zeitung"
replace quality = 5.88 if outlet == "berliner mopo"
replace quality = 5 if outlet == "bz berlin"
replace quality = 4.67 if outlet == "express"
replace quality = 8.26 if outlet == "faz"
replace quality = 6.7 if outlet == "freie presse"
replace quality = 7.08 if outlet == "fr"
replace quality = 6.96 if outlet == "abendblatt hh"
replace quality = 7.16 if outlet == "handelsblatt"
replace quality = 6.8 if outlet == "ksta"
replace quality = 7.39 if outlet == "lvz"
replace quality = 5 if outlet == "mopo hh"
replace quality = 6.63 if outlet == "main post"
replace quality = 6.92 if outlet == "neue westf"
replace quality = 6.30 if outlet == "nordbayern"
replace quality = 6.87 if outlet == "nordwest zeitung"
replace quality = 7.67 if outlet == "noz"
replace quality = 6.85 if outlet == "passauer neue presse"
replace quality = 7 if outlet == "rp online"
replace quality = 6.85 if outlet == "südkurier"
replace quality = 8.45 if outlet == "spon"
replace quality = 7.53 if outlet == "stuttgarter zeitung"
replace quality = 6.47 if outlet == "schwäbische"
replace quality = 8.35 if outlet == "sz"
replace quality = 6.81 if outlet == "taz"
replace quality = 8 if outlet == "tagesspiegel"
replace quality = 7 if outlet == "welt"
replace quality = 8.03 if outlet == "zeitonline"
replace quality = 8.38 if outlet == "zeit"



* Put the listed variables first, in this order; all remaining variables follow.
order id platform twitter outlet outlet_platform_id date after_march16 rel_date ///
    text ///
    likes shares log_likes log_shares asinh_likes asinh_shares ///
    hour items ///
    start_all number_of_words mean_word_length questionmarks exclamationmarks ///
    share_neg_words share_pos_words ///
    national tabloid quality pre_switch_tw_foll ///
    max_cosine outlet_trend outlet_trend_sq outlet_trend_tri

* --- variable labels: identifiers and timing
label var id "tweet/post ID"
label var platform "platform"
label var twitter "1 if Twitter, 0 if Facebook"
label var outlet "outlet label"
label var outlet_platform_id "outlet-platform ID"
label var date "date of publication"
label var after_march16 "1 if after Mar 16, 0 if not"
label var rel_date "number of days relative to Mar 17"
label var hour "hour of publication"

* --- variable labels: outcomes, their transformations and their daily lags
foreach outcome in likes shares {
    label var `outcome' "number of `outcome'"
    label var log_`outcome' "log (1 + number of `outcome')"
    label var asinh_`outcome' "inverse hyperbolic sine of number of `outcome'"
    forvalues k = 1/7 {
        label var l`k'_avg_`outcome' "average number of `outcome' on day t-`k'"
    }
}

* --- variable labels: text characteristics
label var text "tweet or post text"
label var start_all "1 if post starts with question word, 0 if not"
label var number_of_words "number of words of tweet / post text"
label var mean_word_length "mean word length (characters)"
label var questionmarks "number of question marks"
label var exclamationmarks "number of exclamation marks"
label var share_neg_words "share of negative words"
label var share_pos_words "share of positive words"
label var dominant_name "label of dominant news topic"
label var max_cosine "maximum cosine similarity"

* --- variable labels: outlet characteristics and trends
label var items "total number of tweets / posts per outlet-day"
label var national "1 if national outlet, 0 if regional outlet"
label var tabloid "1 if tabloid, 0 if broadsheet"
label var quality "Wellbrock (2011) quality index"
label var pre_switch_tw_foll "number of Twitter followers before introduction of algorithm"
label var outlet_trend "outlet trend (linear)"
label var outlet_trend_sq "outlet trend (quadratic)"
label var outlet_trend_tri "outlet trend (cubic)"

* Remove the intermediate copy from the working folder and write the final
* dataset, under the same file name, one folder level up.
erase "fb tw micro data w vars.dta"
cd ..
save "fb tw micro data w vars.dta", replace





















